{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CNN Text Classification with Keras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import csv\n",
    "from many_stop_words import get_stop_words\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "from keras import regularizers, initializers, optimizers, callbacks\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.utils.np_utils import to_categorical\n",
    "from keras.layers import Embedding\n",
    "from keras.layers import Dense, Input, Flatten\n",
    "from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout, LSTM, GRU, Bidirectional\n",
    "from keras.models import Model\n",
    "from keras import backend as K\n",
    "from keras.engine.topology import Layer, InputSpec"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read from dataset (`data.csv` file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "texts, labels = [], []\n",
    "with open('data.csv') as csvfile:\n",
    "    readCSV = csv.reader(csvfile, delimiter=',')\n",
    "    for row in readCSV:\n",
    "        texts.append(row[0])\n",
    "        labels.append(row[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Model Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_NB_WORDS = 40000\n",
    "MAX_SEQUENCE_LENGTH = 30\n",
    "VALIDATION_SPLIT = 0.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = Tokenizer(num_words=MAX_NB_WORDS)\n",
    "tokenizer.fit_on_texts(texts)\n",
    "sequences = tokenizer.texts_to_sequences(texts)\n",
    "word_index = tokenizer.word_index\n",
    "print('Found %s unique tokens.' % len(word_index))\n",
    "data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "labels = to_categorical(np.asarray(labels)) # convert to one-hot encoding vectors\n",
    "print('Shape of data tensor:', data.shape)\n",
    "print('Shape of label tensor:', labels.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "indices = np.arange(data.shape[0])\n",
    "np.random.shuffle(indices)\n",
    "data = data[indices]\n",
    "labels = labels[indices]\n",
    "nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train = data[:-nb_validation_samples]\n",
    "y_train = labels[:-nb_validation_samples]\n",
    "x_val = data[-nb_validation_samples:]\n",
    "y_val = labels[-nb_validation_samples:]\n",
    "\n",
    "print('Number of entries in each category:')\n",
    "print(\"Training:\\n\",y_train.sum(axis=0))\n",
    "print(\"Validation:\\n\",y_val.sum(axis=0))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preparing the Embedding layer\n",
    "Compute an index mapping words to known embeddings, by parsing the data dump of pre-trained embeddings: [GloVe](https://nlp.stanford.edu/projects/glove/) vectors from Stanford NLP. For new words, a \"randomised vector\" will be created."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EMBEDDING_DIM = 100\n",
    "GLOVE_DIR = \"dataset/glove/glove.twitter.27B.100d.txt\"\n",
    "embeddings_index = {}\n",
    "f = open(GLOVE_DIR)\n",
    "print(\"Loading GloVe from:\",GLOVE_DIR,\"...\",end=\"\")\n",
    "for line in f:\n",
    "    values = line.split()\n",
    "    word = values[0]\n",
    "    coefs = np.asarray(values[1:], dtype='float32')\n",
    "    embeddings_index[word] = coefs\n",
    "f.close()\n",
    "print(\"Done.\\nProceeding with Embedding Matrix...\")\n",
    "embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))\n",
    "for word, i in word_index.items():\n",
    "    embedding_vector = embeddings_index.get(word)\n",
    "    if embedding_vector is not None:\n",
    "        # words not found in embedding index will be all-zeros.\n",
    "        embedding_matrix[i] = embedding_vector\n",
    "print(\"Completed!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "After computing our embedding matrix, load this embedding matrix into an `Embedding` layer. Toggle `trainable=False` to prevent the weights from being updated during training."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "embedding_layer = Embedding(len(word_index) + 1,\n",
    "                            EMBEDDING_DIM,\n",
    "                            weights=[embedding_matrix],\n",
    "                            input_length=MAX_SEQUENCE_LENGTH,\n",
    "                            trainable=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Simplified 1D CNN\n",
    "[Reference](https://github.com/richliao/textClassifier), [Source](https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html) and [Notes](http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32') # 1000\n",
    "embedded_sequences = embedding_layer(sequence_input)\n",
    "l_cov1= Conv1D(128, 3, activation='relu',kernel_regularizer=regularizers.l2(0.01))(embedded_sequences)\n",
    "l_cov2= Conv1D(128, 3, activation='relu',kernel_regularizer=regularizers.l2(0.01))(l_cov1)\n",
    "l_pool1 = MaxPooling1D(3)(l_cov2)\n",
    "#l_drop1 = Dropout(0.2)(l_pool1)\n",
    "l_cov3 = Conv1D(128, 3, activation='relu',kernel_regularizer=regularizers.l2(0.01))(l_pool1)\n",
    "l_pool4 = MaxPooling1D(6)(l_cov3) # global max pooling\n",
    "l_flat = Flatten()(l_pool4)\n",
    "l_dense = Dense(128, activation='relu')(l_flat)\n",
    "preds = Dense(4, activation='softmax')(l_dense) # 4 categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "model = Model(sequence_input, preds)\n",
    "adadelta = optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=None, decay=0)\n",
    "model.compile(loss='categorical_crossentropy',\n",
    "              optimizer=adadelta,\n",
    "              metrics=['acc'])\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def step_cyclic(epoch):\n",
    "    try:\n",
    "        if epoch%11==0:\n",
    "            return float(2)\n",
    "        elif epoch%3==0:\n",
    "            return float(1.1)\n",
    "        elif epoch%7==0:\n",
    "            return float(0.6)\n",
    "        else:\n",
    "            return float(1.0)\n",
    "    except:\n",
    "        return float(1.0)\n",
    "        \n",
    "tensorboard = callbacks.TensorBoard(log_dir='./logs', histogram_freq=0, batch_size=256, write_grads=True , write_graph=True)\n",
    "model_checkpoints = callbacks.ModelCheckpoint(\"checkpoints\", monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=False, mode='auto', period=1)\n",
    "lr_schedule = callbacks.LearningRateScheduler(step_cyclic)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Training Progress:\")\n",
    "model.fit(x_train, y_train, validation_data=(x_val, y_val),\n",
    "          epochs=200, batch_size=128,\n",
    "          callbacks=[tensorboard, model_checkpoints, lr_schedule])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
